SAS <-> R :: CHEAT SHEET 


Introduction 


This guide aims to familiarise SAS users with R. 
R examples make use of tidyverse collection of packages. 


Install tidyverse: 
Attach tidyverse packages for use: 


install.packages("tidyverse") 
library(tidyverse) 


R data here in ‘data frames’, and occasionally vectors (via ¢( ) ) 
Other R structures (lists, matrices...) are not explored here. 


Keyboard shortcuts: <- Alt+- %>% Ctrl + Shift +m 


Datasets; drop, keep & rename variables 


data new_data; new_data <- old_data 
set old_data; 


run; 


data new_data (keep=id); new_data <- old_data %>% 
set old_data (drop=job_title);  select(-job_title) %>% 
run; select(id) 


new_data <- old_data %>% 
select( -starts_with("temp") 


data new_data (drop= temp: ); 
set old_data; 
run; C.f. contains( ), ends_with( ) 


data new_data; new_data <- old_data %>% 


set old_data; rename(new_name = old_name) 
rename old_name = new_name; 
run; Note order differs 


Conditional filtering 


data new_data; new_data <- old_data %>% 


set old_data; filter(Sex == "M") 
if Sex = "M"; 
run; 


new_data <- old_data %>% 
filter(year %in% c(2010,2011,2012)) 


data new_data; 

set old_data; 

if year in (2010,2011,2012); 
run; 


data new_data; new_data <- old_data %>% 


set old_data; group _by( id) %>% 
by id; slice(1) 
if first.id ; 

run; 


Could use slice(n( )) for last 


data new_data; new_data <- old_data %>% 


set old_data; filter(dob > as.Date("1990-04-25")) 
if dob > "25APR1990"d; 
run; 


New variables, conditional editing 


new_data <- old_data %>% 
mutate(total_income = wages + benefits) 


data new_data; 

set old_data; 

total_income = wages + benefits ; 
run; 


new_data <- old_data %>% 
mutate(full_time = if_else(hours > 30, "Y" , "N")) 


data new_data; 
set old_data; 
if hours > 30 then full_time = "Y"; 
else full_time = "N"; 

run; 


new_data <- old_data %>% 
mutate(weather = case_when( 
temp > 20 ~ "Warm", 
temp > 10 ~"Mild", 
TRUE ~ "Cold" ) ) 


data new_data; 
set old_data; 
if temp > 20 then weather = "Warm"; 
else if temp > 10 then weather = "Mild"; 
else weather = "Cold"; 

run; 


Counting and Summarising 


old_data %>% 
count( job_type ) 


proc freq data = old_data ; 
table job_type; 


. For percent, add: 
run; 


%>% mutate(percent = n*100/sum(n)) 


old_data %>% 
count( job_type , region ) 


proc freq data = old_data ; 
table job_type*region ; 
run; 


proc summary data = old_data nway; new_data <- old_data %>% 
class job_type region ; 


group_by( job_type, region ) %>% 
summarise( Count = n() ) 
run; 


output out = new_data ; 
Equivalent without nway not trivially produced 


new_data <- old_data %>% 
group_by( job_type , region ) %>% 
summarise( total_salaries = sum( salary ) , 
Count = n() ) 


proc summary data = old_data nway ; 
class job_type region ; 
var salary ; 
output out = new_data 
sum( salary ) = total_salaries ; 


rar Lots of summary functions in both languages 


Swap summarise( ) for mutate( ) to add summary data to original data 
Combining datasets 


data new_data ; new_data <- bind_rows( data_1 , data_2 ) 
set data_1 data_2; 
runy; C.f. rbind( ) which produces error if columns are not identical 
data new_data ; 
merge data_1 (in= in_1) data_2; 
by id ; 
if in1; 
run; C.f. full_join( ), right_join(), inner_join() 


new_data <- left_join( data_1 , data_2 , by = "id") 


Some plotting in R 


ggplot( my_data , aes( year , sales ) ) + 
geom_point( ) + geom_line( ) 
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ggplot( my_data , aes( year , sales ) ) + 
geom_point( ) + geom_line( ) + ylim(0, 40) + 
labs(x = "" , y = "Sales per year") 
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ggplot(my_data, aes( year, sales, colour = dept) ) + 
geom_point( ) + geom_line( ) 
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ggplot( my_data , aes( year, sales, fill = dept) ) + 
geom_col() 
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Note ‘colour’ for lines & points, ‘fill’ for shapes 


ggplot( my_data , aes( year, sales, fill = dept) ) + 
geom_col( position = "dodge" ) + coord_flip( ) 
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C.f. position = "fill" for 100% stacked bars/cols 
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Sorting and Row-Wise Operations 


new_data <- old_data %>% 
arrange( id , desc( income ) ) 


proc sort data=old_data out=new_data; 
by id descending income ; 
run; 


proc sort data=old_data nodup; old_data <- old_data %>% 
by id job_type; arrange( id , job_type)) %>% 
run; distinct( ) 


Note nodup relies on adjacency of duplicate rows, distinct( ) does not 


proc sort data=old_data nodupkey; old_data <- old_data %>% 


by id; arrange( id ) %>% 
run; group _by( id) %>% 
slice( 1 ) 


new_data <- old_data %>% 
group_by( id ) %>% 
slice(which.max( income )) 


data new_data; 
set old_data; 
by id descending income ; 
if first.id ; 

run; 


C.f.which.min() 
Swap to preserve duplicate maxima: ... slice_max( income ) 
Alternatively: ... filter(income==max(income)) 


data new_data; new_data <- old_data %>% 


set old_data; mutate( prev_id = lag( id , 1 )) 
prev_id= lag( id ); 
run; 


C.f. lead( ) for subsequent rows 


data new_data; new_data <- old_data %>% 
set old_data; group_by( id ) %>% 
by id; mutate( counter = row_number( ) ) 
counter +1; 
if first.id then counter = 1; 
run; 


Converting and Rounding 


new_data <- old_data %>% 
mutate(num_var = as.numeric("5" )) %>% 
mutate(text_var = as.character( 5 )) 


data new_data; 
set old_data ; 
num_var = input("5" , 8. ); 
text_var = put(5, 8. ); 
run; 


new_data <- old_data %>% 
mutate(nearest_5 = round(x/5)*5) %>% 
mutate(two_decimals = round( x , digits = 2) 


data new_data ; 
set old_data; 
nearest_5 = round( x, 5) 
two_decimals = round( x , 0.01) 
run; 


Creating functions to modify datasets 


add variable <- function( dataset_name ){ 
dataset_name <- dataset_name %>% 
mutate(new_variable = 1) 
return( dataset_name ) 


%macro add _variable(dataset_name); 
data &dataset_name; 
set &dataset_name; 
new_variable = 1; 
run; } 
%mend; my_data <- add_variable( my_data ) 


9 H . 
%add_variable( my_data ); Note SAS can modify within the macro, 


whereas R creates a copy within the function 


String Manipulation 


data new_data; 

set old_data; 

if find( job_title , "Health" ); 
run; 


data new_data; 

set old_data; 

substring = substr( big_string , 3, 4); 
run; 


data new_data; 
set old_data; 


new_data <- old_data %>% 
filter( str_detect( job_title , "Health" )) 


new_data <- old_data %>% 
mutate( substring = str_sub( big_string , 3, 6 )) 


Returns characters 3 to 6. Note SAS uses <start>, <length>, R uses <start>, <end> 


new_data <- old_data %>% 
mutate( address = str_replace_all( address , "Street" , "St" )) 


address = tranwrd( address , "Street" , "St" ); 


run; 


data new_data; 


C.f. str_replace( ) for first instance of pattern only 


new_data <- old_data %>% 
mutate( house_number = str_extract( address , "\\d+" )) 


Wide range of regexps in both languages, this example extracts digits only 


set old_data; 

house_number = compress( address , , "dk" ); 
run; 
Transpose/Pivot 


proc transpose data=long_data out=wide_data; 


by student e 
id subject ; | 
var grade ; Add NOTSORTED if long_data 


run; 


wide_data <- long_data %>% 
pivot_wider(names_from = subject , values _from = grade) 


is not sorted by student 


proc transpose data=wide_data 


long_data <- wide_data %>% 


out=long_data(rename=(col1=grade)) name=subject; pivot_longer(c(English, Irish, Maths) , 


by student ; 
var English Irish Maths; 
run; 


File operations 


Operate in ‘Work’ library. 
Use libname to define file locations 


libname library_name "file_location"; 
data library_name.saved_data; 

set data_in_use; 
run; 


libname library_name "“file_location"; 
data data_in_use; 

set library_name.saved_data ; 
run; 


proc export data = my_data 


names_to = "subject", values_to = "grade") 


Operate in a particular ‘working directory’ (identify using getwd( ) ) 
Move to other locations using setwd( ) 


saveRDS(data_in_use, file="file_location/saved_data.rds") 
or 

setwd("file_location") 

saveRDS( data_in_use, file = "saved_data.rds") 


data_in_use <- readRDS("file_location/saved_data.rds" ) 
or 

setwd("file_location") 

data_in_use <- readRDS("saved_data.rds") 


write_csv(my_data , “my_file.csv") 


outfile = "my_file.csv" dbms = csv replace; 


run; 


proc import datafile = “my_file.csv" 
out = my_data dbms = csv; 
run; 


my_data <- read_csv("my_file.csv") 


Both examples assume column headers in csv file 
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